{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "数据分区及建模"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>          <td>value</td>      <th>  R-squared:         </th> <td>   0.853</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.847</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   138.0</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Wed, 15 Jul 2020</td> <th>  Prob (F-statistic):</th> <td>6.39e-57</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>08:09:46</td>     <th>  Log-Likelihood:    </th> <td> -1354.1</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>   150</td>      <th>  AIC:               </th> <td>   2722.</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>   143</td>      <th>  BIC:               </th> <td>   2743.</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     6</td>      <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "    <td></td>       <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>const</th> <td> 2630.7190</td> <td> 1907.483</td> <td>    1.379</td> <td> 0.170</td> <td>-1139.788</td> <td> 6401.226</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x1</th>    <td>  410.6209</td> <td>   93.913</td> <td>    4.372</td> <td> 0.000</td> <td>  224.984</td> <td>  596.257</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x2</th>    <td>   -0.0067</td> <td>    0.033</td> <td>   -0.201</td> <td> 0.841</td> <td>   -0.072</td> <td>    0.059</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x3</th>    <td>    0.0730</td> <td>    0.035</td> <td>    2.076</td> <td> 0.040</td> <td>    0.004</td> <td>    0.142</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x4</th>    <td>   -0.0197</td> <td>    0.037</td> <td>   -0.539</td> <td> 0.591</td> <td>   -0.092</td> <td>    0.053</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x5</th>    <td>    0.1035</td> <td>    0.039</td> <td>    2.654</td> <td> 0.009</td> <td>    0.026</td> <td>    0.181</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x6</th>    <td>    0.6617</td> <td>    0.060</td> <td>   11.103</td> <td> 0.000</td> <td>    0.544</td> <td>    0.780</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td> 1.331</td> <th>  Durbin-Watson:     </th> <td>   1.868</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.514</td> <th>  Jarque-Bera (JB):  </th> <td>   1.010</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td> 0.190</td> <th>  Prob(JB):          </th> <td>   0.604</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td> 3.132</td> <th>  Cond. No.          </th> <td>6.48e+05</td>\n",
       "</tr>\n",
       "</table><br/><br/>Warnings:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.<br/>[2] The condition number is large, 6.48e+05. This might indicate that there are<br/>strong multicollinearity or other numerical problems."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                  value   R-squared:                       0.853\n",
       "Model:                            OLS   Adj. R-squared:                  0.847\n",
       "Method:                 Least Squares   F-statistic:                     138.0\n",
       "Date:                Wed, 15 Jul 2020   Prob (F-statistic):           6.39e-57\n",
       "Time:                        08:09:46   Log-Likelihood:                -1354.1\n",
       "No. Observations:                 150   AIC:                             2722.\n",
       "Df Residuals:                     143   BIC:                             2743.\n",
       "Df Model:                           6                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "const       2630.7190   1907.483      1.379      0.170   -1139.788    6401.226\n",
       "x1           410.6209     93.913      4.372      0.000     224.984     596.257\n",
       "x2            -0.0067      0.033     -0.201      0.841      -0.072       0.059\n",
       "x3             0.0730      0.035      2.076      0.040       0.004       0.142\n",
       "x4            -0.0197      0.037     -0.539      0.591      -0.092       0.053\n",
       "x5             0.1035      0.039      2.654      0.009       0.026       0.181\n",
       "x6             0.6617      0.060     11.103      0.000       0.544       0.780\n",
       "==============================================================================\n",
       "Omnibus:                        1.331   Durbin-Watson:                   1.868\n",
       "Prob(Omnibus):                  0.514   Jarque-Bera (JB):                1.010\n",
       "Skew:                           0.190   Prob(JB):                        0.604\n",
       "Kurtosis:                       3.132   Cond. No.                     6.48e+05\n",
       "==============================================================================\n",
       "\n",
       "Warnings:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "[2] The condition number is large, 6.48e+05. This might indicate that there are\n",
       "strong multicollinearity or other numerical problems.\n",
       "\"\"\""
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import statsmodels.api as sm\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pickle\n",
    "\n",
    "pdata=pd.read_csv(\"data/pdata.csv\")\n",
    "pdata = pdata.reset_index()\n",
    "pdata = pdata.drop(columns='index')\n",
    "train_set = pdata.loc[0:149,]\n",
    "test_set = pdata.loc[149:,]\n",
    "test_set.to_csv(\"data/test_set.csv\", index=False)\n",
    "\n",
    "x = np.column_stack((train_set.month,train_set.r1_value,train_set.r4_value,\n",
    "                     train_set.r6_value,train_set.r8_value,train_set.r12_value))\n",
    "X = sm.add_constant(x)\n",
    "model = sm.OLS(train_set.value, X).fit()\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "移除掉x2和x4，重新构建模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>          <td>value</td>      <th>  R-squared:         </th> <td>   0.852</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.848</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   209.3</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Wed, 15 Jul 2020</td> <th>  Prob (F-statistic):</th> <td>3.65e-59</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>08:09:46</td>     <th>  Log-Likelihood:    </th> <td> -1354.3</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>   150</td>      <th>  AIC:               </th> <td>   2719.</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>   145</td>      <th>  BIC:               </th> <td>   2734.</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     4</td>      <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "    <td></td>       <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>const</th> <td> 1741.1626</td> <td> 1201.878</td> <td>    1.449</td> <td> 0.150</td> <td> -634.302</td> <td> 4116.627</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x1</th>    <td>  425.1202</td> <td>   86.591</td> <td>    4.910</td> <td> 0.000</td> <td>  253.978</td> <td>  596.263</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x2</th>    <td>    0.0770</td> <td>    0.034</td> <td>    2.261</td> <td> 0.025</td> <td>    0.010</td> <td>    0.144</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x3</th>    <td>    0.1085</td> <td>    0.038</td> <td>    2.893</td> <td> 0.004</td> <td>    0.034</td> <td>    0.183</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x4</th>    <td>    0.6573</td> <td>    0.059</td> <td>   11.222</td> <td> 0.000</td> <td>    0.542</td> <td>    0.773</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td> 1.428</td> <th>  Durbin-Watson:     </th> <td>   1.893</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.490</td> <th>  Jarque-Bera (JB):  </th> <td>   1.106</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td> 0.200</td> <th>  Prob(JB):          </th> <td>   0.575</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td> 3.128</td> <th>  Cond. No.          </th> <td>3.19e+05</td>\n",
       "</tr>\n",
       "</table><br/><br/>Warnings:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.<br/>[2] The condition number is large, 3.19e+05. This might indicate that there are<br/>strong multicollinearity or other numerical problems."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                  value   R-squared:                       0.852\n",
       "Model:                            OLS   Adj. R-squared:                  0.848\n",
       "Method:                 Least Squares   F-statistic:                     209.3\n",
       "Date:                Wed, 15 Jul 2020   Prob (F-statistic):           3.65e-59\n",
       "Time:                        08:09:46   Log-Likelihood:                -1354.3\n",
       "No. Observations:                 150   AIC:                             2719.\n",
       "Df Residuals:                     145   BIC:                             2734.\n",
       "Df Model:                           4                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "const       1741.1626   1201.878      1.449      0.150    -634.302    4116.627\n",
       "x1           425.1202     86.591      4.910      0.000     253.978     596.263\n",
       "x2             0.0770      0.034      2.261      0.025       0.010       0.144\n",
       "x3             0.1085      0.038      2.893      0.004       0.034       0.183\n",
       "x4             0.6573      0.059     11.222      0.000       0.542       0.773\n",
       "==============================================================================\n",
       "Omnibus:                        1.428   Durbin-Watson:                   1.893\n",
       "Prob(Omnibus):                  0.490   Jarque-Bera (JB):                1.106\n",
       "Skew:                           0.200   Prob(JB):                        0.575\n",
       "Kurtosis:                       3.128   Cond. No.                     3.19e+05\n",
       "==============================================================================\n",
       "\n",
       "Warnings:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "[2] The condition number is large, 3.19e+05. This might indicate that there are\n",
       "strong multicollinearity or other numerical problems.\n",
       "\"\"\""
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = np.column_stack((train_set.month,train_set.r4_value,train_set.r8_value,train_set.r12_value))\n",
    "X = sm.add_constant(x)\n",
    "model = sm.OLS(train_set.value, X).fit()\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "加入二次项、三次项，重新建模"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>          <td>value</td>      <th>  R-squared:         </th> <td>   0.859</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.854</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   175.0</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Wed, 15 Jul 2020</td> <th>  Prob (F-statistic):</th> <td>2.42e-59</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>08:09:46</td>     <th>  Log-Likelihood:    </th> <td> -1351.0</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>   150</td>      <th>  AIC:               </th> <td>   2714.</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>   144</td>      <th>  BIC:               </th> <td>   2732.</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     5</td>      <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "    <td></td>       <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>const</th> <td>   -0.1528</td> <td>    0.030</td> <td>   -5.146</td> <td> 0.000</td> <td>   -0.212</td> <td>   -0.094</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x1</th>    <td>  438.3667</td> <td>   85.160</td> <td>    5.148</td> <td> 0.000</td> <td>  270.042</td> <td>  606.691</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x2</th>    <td>    0.5917</td> <td>    0.176</td> <td>    3.371</td> <td> 0.001</td> <td>    0.245</td> <td>    0.939</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x3</th>    <td>-3.141e-05</td> <td> 1.07e-05</td> <td>   -2.938</td> <td> 0.004</td> <td>-5.25e-05</td> <td>-1.03e-05</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x4</th>    <td> 5.215e-10</td> <td>  1.8e-10</td> <td>    2.893</td> <td> 0.004</td> <td> 1.65e-10</td> <td> 8.78e-10</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x5</th>    <td>    0.1065</td> <td>    0.037</td> <td>    2.893</td> <td> 0.004</td> <td>    0.034</td> <td>    0.179</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x6</th>    <td>    0.6609</td> <td>    0.057</td> <td>   11.503</td> <td> 0.000</td> <td>    0.547</td> <td>    0.774</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td> 0.868</td> <th>  Durbin-Watson:     </th> <td>   1.867</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.648</td> <th>  Jarque-Bera (JB):  </th> <td>   0.852</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td> 0.179</td> <th>  Prob(JB):          </th> <td>   0.653</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td> 2.913</td> <th>  Cond. No.          </th> <td>1.65e+15</td>\n",
       "</tr>\n",
       "</table><br/><br/>Warnings:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.<br/>[2] The condition number is large, 1.65e+15. This might indicate that there are<br/>strong multicollinearity or other numerical problems."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                  value   R-squared:                       0.859\n",
       "Model:                            OLS   Adj. R-squared:                  0.854\n",
       "Method:                 Least Squares   F-statistic:                     175.0\n",
       "Date:                Wed, 15 Jul 2020   Prob (F-statistic):           2.42e-59\n",
       "Time:                        08:09:46   Log-Likelihood:                -1351.0\n",
       "No. Observations:                 150   AIC:                             2714.\n",
       "Df Residuals:                     144   BIC:                             2732.\n",
       "Df Model:                           5                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "const         -0.1528      0.030     -5.146      0.000      -0.212      -0.094\n",
       "x1           438.3667     85.160      5.148      0.000     270.042     606.691\n",
       "x2             0.5917      0.176      3.371      0.001       0.245       0.939\n",
       "x3         -3.141e-05   1.07e-05     -2.938      0.004   -5.25e-05   -1.03e-05\n",
       "x4          5.215e-10    1.8e-10      2.893      0.004    1.65e-10    8.78e-10\n",
       "x5             0.1065      0.037      2.893      0.004       0.034       0.179\n",
       "x6             0.6609      0.057     11.503      0.000       0.547       0.774\n",
       "==============================================================================\n",
       "Omnibus:                        0.868   Durbin-Watson:                   1.867\n",
       "Prob(Omnibus):                  0.648   Jarque-Bera (JB):                0.852\n",
       "Skew:                           0.179   Prob(JB):                        0.653\n",
       "Kurtosis:                       2.913   Cond. No.                     1.65e+15\n",
       "==============================================================================\n",
       "\n",
       "Warnings:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "[2] The condition number is large, 1.65e+15. This might indicate that there are\n",
       "strong multicollinearity or other numerical problems.\n",
       "\"\"\""
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = np.column_stack((train_set.month,train_set.r4_value,\n",
    "                     train_set.r4_value**2,\n",
    "                     train_set.r4_value**3,\n",
    "                     train_set.r8_value,\n",
    "                     train_set.r12_value))\n",
    "X = sm.add_constant(x)\n",
    "model = sm.OLS(train_set.value, X).fit()\n",
    "\n",
    "with open('data/model.pkl', 'wb') as f:\n",
    "    pickle.dump(model, f)\n",
    "\n",
    "model.summary()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
